Uses models generated in RandomForestWeather to generate future weather parameters. Weather parameters joined with datetime parameters are used for prediction of bike availability.
# Import pandas, numpy, matplotlib, seaborn libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
from tabulate import tabulate
import datetime
# Using sklearn to split data into training and testing sets,train classifier and regressor models
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn import metrics
#pickle package saves and loads sklearn models
import pickle
# hide ipykernel warnings
import warnings
warnings.filterwarnings('ignore')
%matplotlib inline
# Read in the dynamic bike-station data as a pandas dataframe and display the last 5 rows
bikeDynamic = pd.read_csv('dBikeD.csv')
# Drop columns that are not used as model features.
bikeDynamic = bikeDynamic.drop(columns=['last_update', 'status'])
bikeDynamic.tail(5)
# Inspect shape, dtypes and per-column missing-value counts.
print('The shape of our features is:', bikeDynamic.shape,'\n', bikeDynamic.dtypes)
bikeDynamic.isnull().sum()
# Read in the dynamic weather data and keep only the columns used as model features.
weatherDynamic = pd.read_csv('dWeatherD.csv')
# BUG FIX: reset_index returns a new frame; the original discarded the result,
# making the call a no-op. Assign it back.
bikeDynamic = bikeDynamic.reset_index(drop=True)
weatherDynamic = weatherDynamic[['main_temp', 'main_feels_like', 'main_pressure', 'main_humidity',
                                 'main_temp_min', 'main_temp_max', 'wind_speed', 'wind_deg',
                                 'data_entry_timestamp']]
weatherDynamic.tail(5)
print('The shape of our features is:', weatherDynamic.shape,'\n', weatherDynamic.dtypes)
weatherDynamic.isnull().sum()
# Fill gaps in wind direction with the previous reading. A plain ffill on the
# column is equivalent to the original masked .loc assignment, which only
# touched null rows anyway.
weatherDynamic['wind_deg'] = weatherDynamic['wind_deg'].ffill()
weatherDynamic.isnull().sum()
Observations show that the results for 28 February and 24 March are partial. Hence, they can be dropped for data consistency. It is up to the user's discretion whether to incorporate them for training or not.
# DATETIME DATA
# Select columns containing datetime data.
continous_date_columns = bikeDynamic[['data_entry_timestamp']].columns
# Parse each timestamp column from string into pandas datetime dtype.
for column in continous_date_columns:
    bikeDynamic[column] = pd.to_datetime(bikeDynamic[column])
# Optional: the partial days 28-2-2020 and 24-3-2020 can be filtered out here
# with a date mask on data_entry_timestamp if desired (see note above).
# Truncate to one-minute resolution so bike rows join exactly with weather
# rows. dt.floor('min') is equivalent to the original
# strftime("%Y-%m-%d %H:%M:00") / to_datetime round-trip, without the costly
# datetime -> string -> datetime conversion.
bikeDynamic['data_entry_timestamp'] = bikeDynamic['data_entry_timestamp'].dt.floor('min')
bikeDynamic.tail()
print('The shape of our features is:', bikeDynamic.shape,'\n', bikeDynamic.dtypes)
#DATETIME DATA
# Select columns containing datetime data
continous_date_columns = weatherDynamic[['data_entry_timestamp']].columns
# Parse each listed column from string into pandas datetime dtype.
for column in continous_date_columns:
    weatherDynamic[column] = pd.to_datetime(weatherDynamic[column])
# Optional: omit partial data for the first/last collection days
# (28-2-2020 and 24-3-2020) — kept commented out as a user choice.
# end_date = pd.to_datetime('2020-03-24')
# start_date = pd.to_datetime('2020-02-28')
# mask = (weatherDynamic['data_entry_timestamp'].dt.date > start_date) & (weatherDynamic['data_entry_timestamp'].dt.date < end_date)
# weatherDynamic = weatherDynamic.loc[mask]
# for column in continous_date_columns:
#     weatherDynamic[column] = pd.to_datetime(weatherDynamic[column])
weatherDynamic.tail()
print('The shape of our features is:', weatherDynamic.shape,'\n', weatherDynamic.dtypes)
# Preview both frames before merging them on data_entry_timestamp.
bikeDynamic.head()
weatherDynamic.head()
# Merge bike and weather observations on the shared minute-resolution timestamp.
dfML = pd.merge(bikeDynamic, weatherDynamic, on='data_entry_timestamp')
print(dfML.shape)
dfML.head()
print('The shape of our features is:', dfML.shape,'\n', dfML.dtypes)
# Derive calendar / time-of-day features from the timestamp for the model.
dfML['year'] = dfML.data_entry_timestamp.dt.year
dfML['Day'] = dfML.data_entry_timestamp.dt.day
dfML['month'] = dfML.data_entry_timestamp.dt.month
dfML['hour'] = dfML.data_entry_timestamp.dt.hour
dfML['minute'] = dfML.data_entry_timestamp.dt.minute
dfML['dayOfWeek'] = dfML.data_entry_timestamp.dt.weekday
Data integrity is checked for the following cases:
# INTEGRITY CHECK: available_bike_stands + available_bikes must never exceed
# the station's total bike_stands.
bad = dfML["available_bike_stands"].add(dfML["available_bikes"], axis=0) > dfML["bike_stands"]
test_1 = dfML[["id_Entry","available_bike_stands","available_bikes","bike_stands"]][bad]
print("Number of rows failing the test: ", test_1.shape[0])
test_1.head(5)
# Repair failing rows in one vectorised assignment. The original looped over
# each failing id_Entry and re-scanned the whole frame per id (quadratic);
# assuming id_Entry is a unique row id — TODO confirm — this is equivalent.
dfML.loc[bad, 'bike_stands'] = dfML.loc[bad, 'available_bikes'] + dfML.loc[bad, 'available_bike_stands']
# Re-run the check; it should now report zero failing rows.
bad = dfML["available_bike_stands"].add(dfML["available_bikes"], axis=0) > dfML["bike_stands"]
test_1 = dfML[["id_Entry","available_bike_stands","available_bikes","bike_stands"]][bad]
print("Number of rows failing the test: ", test_1.shape[0])
test_1.head(5)
# Drop columns that are not model inputs (totals, row id, raw timestamp).
dfML = dfML.drop(['bike_stands','id_Entry', 'data_entry_timestamp'], axis = 1)
# BUG FIX: reset_index returns a new frame; the original discarded the result.
dfML = dfML.reset_index(drop = True)
print('The shape of our features is:', dfML.shape,'\n', dfML.dtypes)
# Cast the coarse calendar fields to pandas 'category' dtype so that
# get_dummies one-hot encodes them along with any remaining object columns.
categorical_columns = dfML[['year', 'month', 'dayOfWeek']].columns
dfML = dfML.astype({column: 'category' for column in categorical_columns})
print('The shape of our features is:', dfML.shape)
print(dfML.dtypes)
dfML.head()
# One-hot encode every categorical column into indicator columns.
dfML_enc = pd.get_dummies(dfML, prefix_sep='_')
# X head
dfML_enc.head()
# Persist the encoded feature matrix for reuse outside this notebook.
dfML_enc.to_csv("FeaturesGetDummies.csv")
print('Shape of features after one-hot encoding:', dfML_enc.shape)
def prepareDF(dfML, targetCol):
    """Split a dataframe into feature matrix, target vector and feature names.

    Parameters:
        dfML      - dataframe containing both features and the target column
        targetCol - name of the target column to extract

    Returns (features_array, target_array, feature_names) where the target
    column has been removed from the feature matrix.
    """
    y = np.array(dfML[targetCol])
    # drop() returns a copy, so the caller's frame is left untouched.
    remaining = dfML.drop(targetCol, axis=1)
    names = list(remaining.columns)
    return np.array(remaining), y, names
def trainTestData(features, target):
    """Split features/target into a fixed 75/25 train/test partition.

    random_state=42 keeps the split reproducible across runs.
    Returns (train_features, test_features, train_labels, test_labels).
    """
    split = train_test_split(features, target, test_size=0.25, random_state=42)
    train_features, test_features, train_labels, test_labels = split
    return train_features, test_features, train_labels, test_labels
def trainRandomForest(train_features, train_labels):
    """Fit and return a 100-tree random forest regressor (random_state=42)."""
    model = RandomForestRegressor(n_estimators=100, random_state=42)
    model.fit(train_features, train_labels)
    return model
def predictRandomForest(rf, test_features):
    """Return the fitted model's predictions for *test_features*."""
    return rf.predict(test_features)
def result(predictions, test_labels, targetCol, station):
    """Evaluate predictions, append the scores to result_<targetCol>.csv and
    plot the first 10 actual vs predicted values.

    Parameters:
        predictions - model output for the test set
        test_labels - ground-truth values for the test set
        targetCol   - name of the predicted column (used in file name / y-label)
        station     - station identifier written to the results row
    """
    # Compute each metric once (the original recomputed every metric for the
    # CSV line and again for the console output).
    mae = metrics.mean_absolute_error(test_labels, predictions)
    mse = metrics.mean_squared_error(test_labels, predictions)
    # mse ** 0.5 avoids the `squared=False` keyword, which was deprecated and
    # then removed in scikit-learn 1.6.
    rmse = mse ** 0.5
    r2 = metrics.r2_score(test_labels, predictions)
    # The with-statement closes the file; the original also called close()
    # redundantly inside and after the with block.
    with open('result_' + targetCol + '.csv', 'a') as resultCSV:
        resultCSV.write(str(station) + ",")
        resultCSV.write(str(mae) + ",")
        resultCSV.write(str(mse) + ",")
        resultCSV.write(str(rmse) + ",")
        resultCSV.write(str(r2) + "\n")
    print('\n==============================================================================')
    print("Station", station, "\t", targetCol)
    print("MAE: ", mae)
    print("MSE: ", mse)
    print("RMSE: ", rmse)
    print("R2: ", r2, "\n")
    # Plot actual vs predicted for the first 10 test samples.
    plt.plot(test_labels[:10])
    # plt.xticks(range(len(dates)), dates)
    plt.plot(predictions[:10])
    plt.xlabel('Time')
    plt.ylabel(targetCol)
    plt.show()
def show_Importance(rf, features):
    """Print the model's feature importances, sorted descending.

    Parameters:
        rf       - a fitted model exposing `feature_importances_`
        features - list of feature names aligned with the model's input columns

    Returns the sorted list of (name, rounded_importance) pairs.
    """
    importances = list(rf.feature_importances_)
    # BUG FIX: the original comprehension rebound the `features` parameter as
    # its loop variable and zipped against the *global* feature_list, ignoring
    # the argument entirely. Use the parameter as intended.
    feature_importances = [(name, round(importance, 2))
                           for name, importance in zip(features, importances)]
    # Most important feature first.
    feature_importances = sorted(feature_importances, key=lambda pair: pair[1], reverse=True)
    for name, importance in feature_importances:
        print('Variable: {:20} Importance: {}'.format(name, importance))
    return feature_importances
# Data processing pipeline: for every station and every target column, train a
# random forest, persist it with pickle, reload it and evaluate it.
targetCols = ['available_bikes','available_bike_stands']
# ***************************************************************************************************************
stations = dfML_enc['number'].unique()
# Write one CSV header per result file (the original duplicated this block
# per file; the generated file names are identical).
for targetCol in targetCols:
    with open('result_' + targetCol + '.csv', 'a') as resultCSV:
        resultCSV.write("Station,MAE, MSE,RMSE,R2\n")
for station in stations:
    # Rows belonging to this station only.
    dfML_station = dfML_enc.loc[dfML_enc['number'] == station]
    for targetCol in targetCols:
        # Array of target variables for this station/column.
        target = np.array(dfML_station[targetCol])
        # Inputs: drop both targets and the station id.
        dfML_pipeline = dfML_station.drop(['available_bikes','available_bike_stands','number'], axis = 1)
        # Saving input feature names for later use
        feature_list = list(dfML_pipeline.columns)
        features = np.array(dfML_pipeline)
        train_features, test_features, train_labels, test_labels = trainTestData(features, target)
        rfBikeModel = trainRandomForest(train_features, train_labels)
        # Persist then reload the model; the with-statements close the file
        # handles the original pickle.dump(..., open(...)) calls leaked.
        filename = 'stn' + str(station) + "_" + targetCol + '.pkl'
        with open(filename, 'wb') as modelFile:
            pickle.dump(rfBikeModel, modelFile)
        with open(filename, 'rb') as modelFile:
            rfBikeModel_loaded = pickle.load(modelFile)
        predictions = predictRandomForest(rfBikeModel_loaded, test_features)
        result(predictions, test_labels, targetCol, station)
        # show_Importance(rfBikeModel_loaded, feature_list)